1 Authoring with RMarkdown

Source

2 Reference

[1]
M. Scutari and J.-B. Denis, Bayesian networks: With examples in R. Boca Raton, Florida: CRC Press, 2014.

3 A Probabilistic Graphical Model (PGM)

4 Problem statement: Who takes the train to work?

  1. Demographic predictors
  2. Socioeconomic indicators
  3. Prediction target

5 bnlearn - an R package for Bayesian network learning

library(bnlearn)

5.1 Define the Bayesian network

dag <- empty.graph(nodes = c("A", "G", "E", "O", "C", "T"))
dag
## 
##   Random/Generated Bayesian network
## 
##   model:
##    [A][G][E][O][C][T] 
##   nodes:                                 6 
##   arcs:                                  0 
##     undirected arcs:                     0 
##     directed arcs:                       0 
##   average markov blanket size:           0.00 
##   average neighbourhood size:            0.00 
##   average branching factor:              0.00 
## 
##   generation algorithm:                  Empty
dag <- set.arc(dag, from = "A", to = "E")
dag <- set.arc(dag, from = "G", to = "E")
dag <- set.arc(dag, from = "E", to = "O")
dag <- set.arc(dag, from = "E", to = "C")
dag <- set.arc(dag, from = "O", to = "T")
dag <- set.arc(dag, from = "C", to = "T")
dag
## 
##   Random/Generated Bayesian network
## 
##   model:
##    [A][G][E|A:G][O|E][C|E][T|O:C] 
##   nodes:                                 6 
##   arcs:                                  6 
##     undirected arcs:                     0 
##     directed arcs:                       6 
##   average markov blanket size:           2.67 
##   average neighbourhood size:            2.00 
##   average branching factor:              1.00 
## 
##   generation algorithm:                  Empty

5.2 A faster/alternative way to define the Bayesian network

dag2 <- empty.graph(nodes = c("A", "G", "E", "O", "C", "T"))

arc.set <- matrix(c("A", "E",
      "G", "E",
      "E", "O",
      "E", "C",
      "O", "T",
      "C", "T"),
    byrow = TRUE, ncol = 2,
    dimnames = list(NULL, c("from", "to")))

arcs(dag2) <- arc.set
dag2
## 
##   Random/Generated Bayesian network
## 
##   model:
##    [A][G][E|A:G][O|E][C|E][T|O:C] 
##   nodes:                                 6 
##   arcs:                                  6 
##     undirected arcs:                     0 
##     directed arcs:                       6 
##   average markov blanket size:           2.67 
##   average neighbourhood size:            2.00 
##   average branching factor:              1.00 
## 
##   generation algorithm:                  Empty

5.3 Both approaches result in the same DAG

all.equal(dag, dag2)
## [1] TRUE

5.4 What happens when you insert a cycle into DAG?

try(set.arc(dag, from = "T", to = "E"))
## Error in arc.operations(x = x, from = from, to = to, op = "set", check.cycles = check.cycles,  : 
##   the resulting graph contains cycles.

6 Exploring the bn object

nodes(dag)
## [1] "A" "G" "E" "O" "C" "T"
arcs(dag)
##      from to 
## [1,] "A"  "E"
## [2,] "G"  "E"
## [3,] "E"  "O"
## [4,] "E"  "C"
## [5,] "O"  "T"
## [6,] "C"  "T"

7 Joint probability as a product of conditional probabilities

Use the modelstring function to generate the product of conditional probabilities.

modelstring(dag)
## [1] "[A][G][E|A:G][O|E][C|E][T|O:C]"

8 Factorization

\[\operatorname{Pr}(\mathrm{A}, \mathrm{G}, \mathrm{E}, \mathrm{O}, \mathrm{C}, \mathrm{T})=\operatorname{Pr}(\mathrm{A}) \operatorname{Pr}(\mathrm{G}) \operatorname{Pr}(\mathrm{E} \mid \mathrm{A}, \mathrm{G}) \operatorname{Pr}(\mathrm{O} \mid \mathrm{E}) \operatorname{Pr}(\mathrm{C} \mid \mathrm{E}) \operatorname{Pr}(\mathrm{T} \mid \mathrm{O}, \mathrm{C})\]

9 Specify joint probability distribution over the random variables

A.lv <- c("young", "adult", "old") # age
G.lv <- c("M", "F") # gender
E.lv <- c("high", "uni") # education
O.lv <- c("emp", "self") # occupation
C.lv <- c("small", "big") # city of residence
T.lv <- c("car", "train", "other") # mode of transport

Marginal probability distributions for Age and Gender variables:

A.prob <- array(c(0.30, 0.50, 0.20), dim = 3, 
                dimnames = list(A = A.lv))
A.prob
## A
## young adult   old 
##   0.3   0.5   0.2
G.prob <- array(c(0.60, 0.40), dim = 2, 
                dimnames = list(G = G.lv))
G.prob
## G
##   M   F 
## 0.6 0.4

Conditional probability distributions for Occupation and City of residence variables:

O.prob <- array(c(0.96, 0.04, 0.92, 0.08), dim = c(2, 2),
            dimnames = list(O = O.lv, E = E.lv))
O.prob
##       E
## O      high  uni
##   emp  0.96 0.92
##   self 0.04 0.08
C.prob <- array(c(0.25, 0.75, 0.20, 0.80), dim = c(2, 2),
            dimnames = list(C = C.lv, E = E.lv))
C.prob
##        E
## C       high uni
##   small 0.25 0.2
##   big   0.75 0.8

Alternative way to specify probability distributions for one- and two-dimensional distributions:

C.prob <- matrix(c(0.25, 0.75, 0.20, 0.80), ncol = 2,
            dimnames = list(C = C.lv, E = E.lv))
C.prob
##        E
## C       high uni
##   small 0.25 0.2
##   big   0.75 0.8

Conditional Probability Tables (CPTs) for Education and Transport are modeled using three-dimensional tables:

E.prob <- array(c(0.75, 0.25, 0.72, 0.28, 0.88, 0.12, 0.64,
            0.36, 0.70, 0.30, 0.90, 0.10), dim = c(2, 3, 2),
            dimnames = list(E = E.lv, A = A.lv, G = G.lv))
E.prob
## , , G = M
## 
##       A
## E      young adult  old
##   high  0.75  0.72 0.88
##   uni   0.25  0.28 0.12
## 
## , , G = F
## 
##       A
## E      young adult old
##   high  0.64   0.7 0.9
##   uni   0.36   0.3 0.1
T.prob <- array(c(0.48, 0.42, 0.10, 0.56, 0.36, 0.08, 0.58,
            0.24, 0.18, 0.70, 0.21, 0.09), dim = c(3, 2, 2),
            dimnames = list(T = T.lv, O = O.lv, C = C.lv))

The joint probability distribution has 143 parameters, whereas the DAG approach has only 21 parameters.

10 Fully-specified Bayesian network

Has two components: 1. DAG 2. Marginal and conditional probability tables

We can recreate the DAG using the model2network function:

dag3 <- model2network("[A][G][E|A:G][O|E][C|E][T|O:C]")
all.equal(dag, dag3)
## [1] TRUE

11 bn.fit class

We can combine our DAG defined earlier with CPTs to create an object of the bn.fit class:

cpt <- list(A = A.prob, G = G.prob, E = E.prob, O = O.prob, 
        C = C.prob, T = T.prob)

bn <- custom.fit(dag, cpt)

Number of parameters of the model:

nparams(bn)
## [1] 21

Number of edges in the model:

arcs(bn)
##      from to 
## [1,] "A"  "E"
## [2,] "G"  "E"
## [3,] "E"  "O"
## [4,] "E"  "C"
## [5,] "O"  "T"
## [6,] "C"  "T"

Print the CPT of the C random variable:

bn$C
## 
##   Parameters of node C (multinomial distribution)
## 
## Conditional probability table:
##  
##        E
## C       high  uni
##   small 0.25 0.20
##   big   0.75 0.80

Extract the values in the CPT for later use:

C.cpt <- coef(bn$C)

Print all CPTs in the model:

bn
## 
##   Bayesian network parameters
## 
##   Parameters of node A (multinomial distribution)
## 
## Conditional probability table:
##  A
## young adult   old 
##   0.3   0.5   0.2 
## 
##   Parameters of node G (multinomial distribution)
## 
## Conditional probability table:
##  G
##   M   F 
## 0.6 0.4 
## 
##   Parameters of node E (multinomial distribution)
## 
## Conditional probability table:
##  
## , , G = M
## 
##       A
## E      young adult  old
##   high  0.75  0.72 0.88
##   uni   0.25  0.28 0.12
## 
## , , G = F
## 
##       A
## E      young adult  old
##   high  0.64  0.70 0.90
##   uni   0.36  0.30 0.10
## 
## 
##   Parameters of node O (multinomial distribution)
## 
## Conditional probability table:
##  
##       E
## O      high  uni
##   emp  0.96 0.92
##   self 0.04 0.08
## 
##   Parameters of node C (multinomial distribution)
## 
## Conditional probability table:
##  
##        E
## C       high  uni
##   small 0.25 0.20
##   big   0.75 0.80
## 
##   Parameters of node T (multinomial distribution)
## 
## Conditional probability table:
##  
## , , C = small
## 
##        O
## T        emp self
##   car   0.48 0.56
##   train 0.42 0.36
##   other 0.10 0.08
## 
## , , C = big
## 
##        O
## T        emp self
##   car   0.58 0.70
##   train 0.24 0.21
##   other 0.18 0.09

12 Estimating the model parameters from an observed sample

survey <- read.table("data/survey.txt", header = TRUE)
class(survey)
## [1] "data.frame"
str(survey)
## 'data.frame':    500 obs. of  6 variables:
##  $ A: chr  "adult" "adult" "adult" "adult" ...
##  $ C: chr  "big" "small" "big" "big" ...
##  $ E: chr  "high" "uni" "uni" "high" ...
##  $ O: chr  "emp" "emp" "emp" "emp" ...
##  $ G: chr  "F" "M" "F" "M" ...
##  $ T: chr  "car" "car" "train" "car" ...
names(survey)
## [1] "A" "C" "E" "O" "G" "T"
class(survey$A)
## [1] "character"
survey[] <- lapply( survey, factor)
str(survey)
## 'data.frame':    500 obs. of  6 variables:
##  $ A: Factor w/ 3 levels "adult","old",..: 1 1 1 1 1 1 1 3 3 2 ...
##  $ C: Factor w/ 2 levels "big","small": 1 2 1 1 1 2 1 1 1 1 ...
##  $ E: Factor w/ 2 levels "high","uni": 1 2 2 1 1 1 1 2 1 2 ...
##  $ O: Factor w/ 2 levels "emp","self": 1 1 1 1 1 1 1 1 1 1 ...
##  $ G: Factor w/ 2 levels "F","M": 1 2 1 2 2 1 1 1 2 1 ...
##  $ T: Factor w/ 3 levels "car","other",..: 1 1 3 1 1 3 1 3 1 1 ...
class(survey$A)
## [1] "factor"
head(survey)
##       A     C    E   O G     T
## 1 adult   big high emp F   car
## 2 adult small  uni emp M   car
## 3 adult   big  uni emp F train
## 4 adult   big high emp M   car
## 5 adult   big high emp M   car
## 6 adult small high emp F train

Estimate parameters with the corresponding empirical frequencies in the data – classic frequentist and maximum likelihood estimates (MLE).

For example, to estimate \(\operatorname{Pr}(\mathrm{O}=\mathrm{emp} \mid \mathrm{E}=\mathrm{high})\):

\[\begin{aligned} \widehat{\operatorname{Pr}}(\mathrm{O}=\mathrm{emp} \mid \mathrm{E}=\mathrm{high}) &= \frac{\widehat{\operatorname{Pr}}(\mathrm{O}=\mathrm{emp}, \mathrm{E}=\mathrm{high})}{\widehat{\operatorname{Pr}}(\mathrm{E}=\mathrm{high})} \\ &= \frac{\#(\mathrm{O}=\mathrm{emp} \text{ and } \mathrm{E}=\mathrm{high})}{\#(\mathrm{E}=\mathrm{high})} \end{aligned}\]

In bnlearn, we compute MLE using the bn.fit function.

options(digits = 3)
bn.mle <- bn.fit(dag, data = survey, method = "mle")

The bn.fit() returns an object of class bn.fit

Note: custom.fit() and bn.fit() are complementary. custom.fit() constructs a BN using a set of custom parameters specified by the user, whereas bn.fit() estimates the parameters from the sampled data.

We can also compute the parameters manually using the sampled data:

prop.table(table(survey[, c("O", "E")]), margin = 2)
##       E
## O        high    uni
##   emp  0.9808 0.9259
##   self 0.0192 0.0741

Verify that the above numbers agree with the corresponding numbers computed using the bn.fit()

bn.mle$O
## 
##   Parameters of node O (multinomial distribution)
## 
## Conditional probability table:
##  
##       E
## O        high    uni
##   emp  0.9808 0.9259
##   self 0.0192 0.0741

We can also estimate the CPTs using their posterior distributions from a uniform prior over each CPT.

bn.bayes <- bn.fit(dag, data = survey, method = "bayes", 
              iss = 10)
bn.bayes$O
## 
##   Parameters of node O (multinomial distribution)
## 
## Conditional probability table:
##  
##       E
## O        high    uni
##   emp  0.9743 0.9107
##   self 0.0257 0.0893

Posterior estimates are farther from both 0 and 1 than the corresponding MLE estimates – prior distribution influence.

Entails advantages:

  1. Regularization conditions of model estimation and inference methods are met.
  2. No sparse CPTs.
  3. Due to robustness in estimation, resulting BNs will have better predictive power.

What happens when we increase the value of iss?

Posterior distribution heads towards the uniform distribution (used as the prior).

bn.bayes <- bn.fit(dag, data = survey, method = "bayes", 
              iss = 20)
bn.bayes$O
## 
##   Parameters of node O (multinomial distribution)
## 
## Conditional probability table:
##  
##       E
## O       high   uni
##   emp  0.968 0.897
##   self 0.032 0.103

13 Learning the DAG structure

A complex task: 1. The number of DAGs increases super-exponentially as the number of nodes grows – not practical to investigate all. 2. The space of possible DAGs is different from real spaces (e.g., \(\mathbb{R}, \mathbb{R}^2\)) – non-continuous with a finite number of elements. Exploration requires ad-hoc algorithms.

Statistical criteria for evaluating DAGs: 1. Conditional independence tests 2. Network scores

13.1 Conditional independence tests

  • Each edge/arc encodes a probabilistic dependence.
  • Conditional independence tests assess whether the dependence is supported by the data.
  • Consider the edge for inclusion in the DAG if the null hypothesis (of conditional independence) is rejected.
  • Consider the edge \(E \longrightarrow T\)
  • The null hypothesis is that Travel is probabilistically independent of Education conditional on T’s parents: \(H_{0}: \mathrm{T} \perp\!\!\!\perp_{P} \mathrm{E} \mid \{\mathrm{O}, \mathrm{C}\}\)
  • Alternative hypothesis: \(H_{1}: \mathrm{T} \not\perp\!\!\!\perp_{P} \mathrm{E} \mid \{\mathrm{O}, \mathrm{C}\}\)

Test the null hypothesis using the log-likelihood ratio \(G^{2}\) or Pearson’s \(\chi^{2}\) to test for conditional independence (instead of marginal independence).

  • For \(G^{2}\), the test statistic is of the form:

\[G^{2}(\mathrm{T}, \mathrm{E} \mid \mathrm{O}, \mathrm{C})=\sum_{t \in \mathrm{T}} \sum_{e \in \mathrm{E}} \sum_{k \in \mathrm{O} \times \mathrm{C}} n_{t e k} \log \frac{n_{t e k}\, n_{++k}}{n_{t+k}\, n_{+e k}}\]

Notation: \(n_{++k}\) denotes the number of observations for \(k\) obtained by summing over all categories of travel mode (\(T\)) and education (\(E\)).

For Pearson’s \(\chi^{2}\),

\[\chi^{2}(\mathrm{T}, \mathrm{E} \mid \mathrm{O}, \mathrm{C})=\sum_{t \in \mathrm{T}} \sum_{e \in \mathrm{E}} \sum_{k \in \mathrm{O} \times \mathrm{C}} \frac{\left(n_{t e k}-m_{t e k}\right)^{2}}{m_{t e k}}\] where \(m_{t e k}=\frac{n_{t+k}\, n_{+e k}}{n_{++k}}\)

Both tests have an asymptotic \(\chi^2\) distribution under the null hypothesis.

Degrees of freedom:

(nlevels(survey[, "T"]) - 1) * (nlevels(survey[, "E"]) - 1) * 
  (nlevels(survey[, "O"]) * nlevels(survey[, "C"]))
## [1] 8

The ci.test() of bnlearn implements both \(G^2\) and \(\chi^2\) tests. \(G^2\) test is equivalent to the mutual information test from information theory.

ci.test("T", "E", c("O", "C"), test = "mi", data = survey)
## 
##  Mutual Information (disc.)
## 
## data:  T ~ E | O + C
## mi = 10, df = 8, p-value = 0.3
## alternative hypothesis: true value is greater than 0

\(\chi^2\) test:

ci.test("T", "E", c("O", "C"), test = "x2", data = survey)
## 
##  Pearson's X^2
## 
## data:  T ~ E | O + C
## x2 = 8, df = 8, p-value = 0.4
## alternative hypothesis: true value is greater than 0

Automate the test of significance using the arc.strength() function.

options(digits = 2)
arc.strength(dag, data = survey, criterion = "x2")
##   from to strength
## 1    A  E  0.00098
## 2    G  E  0.00125
## 3    E  O  0.00264
## 4    E  C  0.00056
## 5    O  T  0.43391
## 6    C  T  0.00136

All edges except the one from \(O\) to \(T\) have p-values smaller than 0.05 and are well supported by the data.

13.2 Network scores

Network scores focus on the DAG as a whole. Bayesian Information criterion (BIC) is one such score.

\(\begin{aligned} \mathrm{BIC} &= \log \widehat{\operatorname{Pr}}(\mathrm{A}, \mathrm{G}, \mathrm{E}, \mathrm{O}, \mathrm{C}, \mathrm{T})-\frac{d}{2} \log n \\ &= \left[\log \widehat{\operatorname{Pr}}(\mathrm{A})-\frac{d_{\mathrm{A}}}{2} \log n\right]+\left[\log \widehat{\operatorname{Pr}}(\mathrm{G})-\frac{d_{\mathrm{G}}}{2} \log n\right] \\ &\quad+\left[\log \widehat{\operatorname{Pr}}(\mathrm{E} \mid \mathrm{A}, \mathrm{G})-\frac{d_{\mathrm{E}}}{2} \log n\right]+\left[\log \widehat{\operatorname{Pr}}(\mathrm{O} \mid \mathrm{E})-\frac{d_{\mathrm{O}}}{2} \log n\right] \\ &\quad+\left[\log \widehat{\operatorname{Pr}}(\mathrm{C} \mid \mathrm{E})-\frac{d_{\mathrm{C}}}{2} \log n\right]+\left[\log \widehat{\operatorname{Pr}}(\mathrm{T} \mid \mathrm{O}, \mathrm{C})-\frac{d_{\mathrm{T}}}{2} \log n\right] \end{aligned}\)

  • \(n\) is the sample size, \(d\) is the number of parameters of the whole network.
  • \(d_{\mathrm{A}}, d_{\mathrm{G}}, d_{\mathrm{E}}, d_{\mathrm{O}}, d_{\mathrm{C}}\) and \(d_{\mathrm{T}}\) are the numbers of parameters associated with each node.

Bayesian Dirichlet equivalent uniform (BDeu) posterior probability.

BIC and BDe assign higher scores to DAGs that fit the data better.

set.seed(456)
options(digits = 6)
score(dag, data = survey, type = "bic")
## [1] -2012.69
score(dag, data = survey, type = "bde", iss = 10)
## [1] -1998.28
score(dag, data = survey, type = "bde", iss = 1)
## [1] -2015.65

Compute scores before and after adding the edge \(E \longrightarrow T\)

dag4 <- set.arc(dag, from = "E", to = "T")
nparams(dag4, survey)
## [1] 29
score(dag4, data = survey, type = "bic")
## [1] -2032.6

\(E \longrightarrow T\) does not help.

Scores can also be used to compare completely different networks.

rnd <- random.graph(nodes = c("A", "G", "E", "O", "C", "T"))
modelstring(rnd)
## [1] "[A][G|A][E|A:G][O|G:E][C|G:E][T|G:E]"
score(rnd, data = survey, type = "bic")
## [1] -2034.99

Several algorithms exist for structure learning; one such algorithm is hill climbing.

learned <- hc(survey)
modelstring(learned)
## [1] "[C][E|C][T|C][A|E][O|E][G|E]"
score(learned, data = survey, type = "bic")
## [1] -1998.43

Change the default score = “bic” to score = “bde”

learned2 <- hc(survey, score = "bde")

The arc.strength() function reports the change in the score due to an edge/arc removal as the arc’s strength when criterion is a network score.

options(digits=3)
arc.strength(learned, data = survey, criterion = "bic")
##   from to strength
## 1    C  E   -3.390
## 2    E  G   -2.726
## 3    C  T   -1.848
## 4    E  A   -1.720
## 5    E  O   -0.827
arc.strength(dag, data = survey, criterion = "bic")
##   from to strength
## 1    A  E    2.489
## 2    G  E    1.482
## 3    E  O   -0.827
## 4    E  C   -3.390
## 5    O  T   10.046
## 6    C  T    2.973

14 Answering questions using discrete BNs

  1. Conditional independence queries (whether a variable is associated to another)
  2. Conditional probability queries (distribution of one or more variables under non-trivial conditioning)
  3. Most likely explanation queries (most likely outcome of one or more variables under non-trivial conditioning)
options(digits = 3)
dsep(dag, x = "G", y = "C")
## [1] FALSE
dsep(dag, x = "O", y = "C")
## [1] FALSE
path(dag, from = "G", to = "C")
## [1] TRUE
dsep(dag, x = "G", y = "C", z = "E")
## [1] TRUE
dsep(dag, x = "O", y = "C", z = "E")
## [1] TRUE
dsep(dag, x = "A", y = "G")
## [1] TRUE
dsep(dag, x = "A", y = "G", z = "E")
## [1] FALSE

14.1 Exact inference

Implemented in package gRain (gRaphical model inference).

Transforms the BN into a junction tree to speed up the computation of conditional probabilities.

library(gRain)
junction <- compile(as.grain(bn))
options(digits = 4)
querygrain(junction, nodes = "T")$T
## T
##    car  train  other 
## 0.5618 0.2809 0.1573
jgender <- setEvidence(junction, nodes = "G", states = "F")
querygrain(jgender, nodes = "T")$T
## T
##    car  train  other 
## 0.5621 0.2806 0.1573
jres <- setEvidence(junction, nodes = "C", states = "small")
querygrain(jres, nodes = "T")$T
## T
##    car  train  other 
## 0.4838 0.4170 0.0990
jedu <- setEvidence(junction, nodes = "E", states = "high")
GxT.cpt <- querygrain(jedu, nodes = c("G", "T"), type = "joint")
GxT.cpt
##    T
## G      car  train   other
##   M 0.3427 0.1737 0.09623
##   F 0.2167 0.1098 0.06087
querygrain(jedu, nodes = c("G", "T"), type = "marginal")
## $G
## G
##      M      F 
## 0.6126 0.3874 
## 
## $T
## T
##    car  train  other 
## 0.5594 0.2835 0.1571
querygrain(jedu, nodes = c("G", "T"), type = "conditional")
##    T
## G      car  train  other
##   M 0.6126 0.6126 0.6126
##   F 0.3874 0.3874 0.3874
dsep(bn, x = "G", y = "T", z = "E")
## [1] TRUE
GxT.ct = GxT.cpt * nrow(survey)
chisq.test(GxT.ct)
## 
##  Pearson's Chi-squared test
## 
## data:  GxT.ct
## X-squared = 2.3e-30, df = 2, p-value = 1
set.seed(123)
cpquery(bn, event = (G == "M") & (T == "car"), 
          evidence = (E == "high"))
## [1] 0.3516
cpquery(bn, event = (G == "M") & (T == "car"), 
            evidence = (E == "high"), n = 10^6)
## [1] 0.3432
set.seed(567)
cpquery(bn, event = (G == "M") & (T == "car"),
            evidence = list(E = "high"), method = "lw")
## [1] 0.3422
set.seed(123)
cpquery(bn, event = (G == "M") & (T == "car"),
  evidence = ((A == "young") & (E == "uni")) | (A == "adult"))
## [1] 0.3352
GxT <- cpdist(bn, nodes = c("G", "T"),
         evidence = (E == "high"))
head(GxT)
##   G     T
## 1 M   car
## 2 M   car
## 3 M   car
## 4 M other
## 5 M   car
## 6 F other
options(digits = 3)
prop.table(table(GxT))
##    T
## G      car  train  other
##   M 0.3509 0.1833 0.1034
##   F 0.1969 0.1042 0.0613
graphviz.plot(dag)

hlight <- list(nodes = nodes(dag), arcs = arcs(dag), 
                  col = "grey", textCol = "grey")
pp <- graphviz.plot(dag, highlight = hlight)

# edgeRenderInfo() from the Rgraphviz package
# edgeRenderInfo() modifies how the arcs are formatted
library(Rgraphviz)
str(edgeRenderInfo(pp))
## List of 13
##  $ enamesFrom: Named chr [1:6] "A" "G" "E" "E" ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ enamesTo  : Named chr [1:6] "E" "E" "O" "C" ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ splines   :List of 6
##   ..$ A~E:List of 1
##   .. ..$ :Formal class 'BezierCurve' [package "Rgraphviz"] with 1 slot
##   .. .. .. ..@ cPoints:List of 4
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 129
##   .. .. .. .. .. .. ..@ y: int 314
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 154
##   .. .. .. .. .. .. ..@ y: int 297
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 193
##   .. .. .. .. .. .. ..@ y: int 270
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 221
##   .. .. .. .. .. .. ..@ y: int 251
##   ..$ G~E:List of 1
##   .. ..$ :Formal class 'BezierCurve' [package "Rgraphviz"] with 1 slot
##   .. .. .. ..@ cPoints:List of 4
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 374
##   .. .. .. .. .. .. ..@ y: int 314
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 349
##   .. .. .. .. .. .. ..@ y: int 297
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 310
##   .. .. .. .. .. .. ..@ y: int 270
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 282
##   .. .. .. .. .. .. ..@ y: int 251
##   ..$ E~O:List of 1
##   .. ..$ :Formal class 'BezierCurve' [package "Rgraphviz"] with 1 slot
##   .. .. .. ..@ cPoints:List of 4
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 230
##   .. .. .. .. .. .. ..@ y: int 214
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 205
##   .. .. .. .. .. .. ..@ y: int 197
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 166
##   .. .. .. .. .. .. ..@ y: int 170
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 138
##   .. .. .. .. .. .. ..@ y: int 151
##   ..$ E~C:List of 1
##   .. ..$ :Formal class 'BezierCurve' [package "Rgraphviz"] with 1 slot
##   .. .. .. ..@ cPoints:List of 4
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 273
##   .. .. .. .. .. .. ..@ y: int 214
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 298
##   .. .. .. .. .. .. ..@ y: int 197
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 337
##   .. .. .. .. .. .. ..@ y: int 170
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 365
##   .. .. .. .. .. .. ..@ y: int 151
##   ..$ O~T:List of 1
##   .. ..$ :Formal class 'BezierCurve' [package "Rgraphviz"] with 1 slot
##   .. .. .. ..@ cPoints:List of 4
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 129
##   .. .. .. .. .. .. ..@ y: int 114
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 154
##   .. .. .. .. .. .. ..@ y: int 97
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 193
##   .. .. .. .. .. .. ..@ y: int 70
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 221
##   .. .. .. .. .. .. ..@ y: int 51
##   ..$ C~T:List of 1
##   .. ..$ :Formal class 'BezierCurve' [package "Rgraphviz"] with 1 slot
##   .. .. .. ..@ cPoints:List of 4
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 374
##   .. .. .. .. .. .. ..@ y: int 114
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 349
##   .. .. .. .. .. .. ..@ y: int 97
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 310
##   .. .. .. .. .. .. ..@ y: int 70
##   .. .. .. .. ..$ :Formal class 'xyPoint' [package "Rgraphviz"] with 2 slots
##   .. .. .. .. .. .. ..@ x: int 282
##   .. .. .. .. .. .. ..@ y: int 51
##  $ labelX    : Named num [1:6] NA NA NA NA NA NA
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ labelY    : Named num [1:6] NA NA NA NA NA NA
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ labelJust : Named chr [1:6] NA NA NA NA ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ labelWidth: Named num [1:6] NA NA NA NA NA NA
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ arrowhead : Named chr [1:6] "open" "open" "open" "open" ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ arrowtail : Named chr [1:6] "none" "none" "none" "none" ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ direction : Named chr [1:6] "forward" "forward" "forward" "forward" ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ col       : Named chr [1:6] "grey" "grey" "grey" "grey" ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ lty       : Named chr [1:6] "solid" "solid" "solid" "solid" ...
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
##  $ lwd       : Named num [1:6] 1 1 1 1 1 1
##   ..- attr(*, "names")= chr [1:6] "A~E" "G~E" "E~O" "E~C" ...
edgeRenderInfo(pp) <- 
  list(col = c("G~E" = "black", "E~C" = "black"),
       lwd = c("G~E" = 3, "E~C" = 3))
# nodeRenderInfo() -- modifies how the nodes are formatted
nodeRenderInfo(pp) <- 
  list(col = c("G" = "black", "E" = "black", "C" = "black"),
    textCol = c("G" = "black", "E" = "black", "C" = "black"),
    fill = c("E" = "grey"))
renderGraph(pp)

bn.fit.barchart(bn.mle$T, main = "Travel", 
  xlab = "Pr(T | C,O)", ylab = "")

Evidence <- 
  factor(c(rep("Unconditional",3), rep("Female", 3), 
           rep("Small City",3)),
         levels = c("Unconditional", "Female", "Small City"))

Travel <- factor(rep(c("car", "train", "other"), 3),
           levels = c("other", "train", "car"))

distr <- data.frame(Evidence = Evidence, Travel = Travel,
           Prob = c(0.5618, 0.2808, 0.15730, 0.5620, 0.2806,
                    0.1573, 0.4838, 0.4170, 0.0990))
head(distr)
##        Evidence Travel  Prob
## 1 Unconditional    car 0.562
## 2 Unconditional  train 0.281
## 3 Unconditional  other 0.157
## 4        Female    car 0.562
## 5        Female  train 0.281
## 6        Female  other 0.157
# barchart() is defined in R lattice package
library(lattice)
barchart(Travel ~ Prob | Evidence, data = distr,
   layout = c(3, 1), xlab = "Probability",
   scales = list(alternating = 1, tck = c(1, 0)),
   strip = strip.custom(factor.levels =
     c(expression(Pr(T)),
       expression(Pr({T} * " | " * {G == F})),
       expression(Pr({T} * " | " * {C == small})))),
   panel = function(...) {
     panel.barchart(...)
     panel.grid(h = 0, v = -1)
   })